#coding:utf-8
import codecs
import json
import os
import jieba
import re

# Load the stop-word list once at import time: one entry per line of the
# UTF-8 file, with surrounding whitespace removed.  The context manager
# closes the handle as soon as the lines have been read.
with codecs.open('data/stopwords','r',encoding='utf8') as _stopwords_file:
    stopwords = [ w.strip() for w in _stopwords_file.readlines() ]

def readFile(filename):
    """Return all lines of *filename* as a list, line endings included.

    The file is opened with ``codecs.open`` in mode ``"r"`` and no explicit
    encoding.  The handle is closed even when reading raises.

    Args:
        filename: path of the file to read.

    Returns:
        The list produced by ``readlines()``; empty for an empty file.
    """
    with codecs.open(filename,"r") as fd:
        return fd.readlines()
def writeWords(lines,filename):
    """Segment comment lines with jieba and append the kept tokens to a file.

    Each line that is not the "user left no review" placeholder is split
    with ``jieba.cut``.  A token is dropped when it contains an ASCII letter
    or digit, when it consists solely of punctuation, or when it appears in
    the module-level ``stopwords``.  If more than three tokens remain, they
    are joined with single spaces and appended to *filename* as UTF-8.

    Args:
        lines: iterable of comment lines to process.
        filename: path of the output file; opened in append mode.
    """
    # Build the patterns and the stop-word set once per call instead of once
    # per token.
    alnum_re = re.compile(r'[a-zA-Z0-9]+', re.M|re.I)
    # Unicode pattern so the CJK punctuation is compared as characters rather
    # than as UTF-8 bytes.  Backslash and hyphen are escaped explicitly: the
    # earlier form turned "=-@" into a range and "\\:" into an escaped colon,
    # so neither character was actually filtered.
    punct_re = re.compile(
        u"^[,.!?/\\\\:;\"'_+=\\-@#$%^&*()<>《》，。？、：；“”‘’！@……（）——+=]+$",
        re.M|re.I)
    stop = frozenset(stopwords)

    # Encoding happens once at the file boundary; the handle is closed on
    # exit, including when an exception is raised.
    with codecs.open(filename,"a",encoding="utf8") as out:
        for line in lines:
            # Placeholder text stored for reviews without content.
            if line.strip()=="此用户未填写评价内容":
                continue
            words=[]
            for word in jieba.cut(line):
                if alnum_re.search(word) or punct_re.search(word):
                    continue
                if word not in stop:
                    words.append(word)
            # NOTE(review): no separator is written between successive
            # records; whether the output ends up one record per line depends
            # on the tokens jieba yields for the line ending -- confirm.
            if len(words)>3:
                out.write(u" ".join(words))

if __name__ == '__main__':
    # List and read the comment files from the same directory.  Previously
    # the listing used a hard-coded absolute path while the files were opened
    # relative to the working directory, so the two could disagree.  A text
    # (unicode) directory name keeps the returned file names as text, which
    # avoids implicit byte/text mixing when the path is built.
    # NOTE(review): assumes the script is run from the project root, as the
    # other relative paths in this file do -- confirm.
    comments_dir = u"comments"
    filename="data/yuliao2.txt"
    for name in os.listdir(comments_dir):
        lines=readFile(os.path.join(comments_dir, name))
        writeWords(lines,filename)
